import libraries
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import sklearn
In [2]:
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
In [3]:
df = pd.read_csv('./Data/WA_Fn-UseC_-HR-Employee-Attrition.csv')
df.head()
Out[3]:
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 35 columns
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1470 entries, 0 to 1469 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1470 non-null int64 1 Attrition 1470 non-null object 2 BusinessTravel 1470 non-null object 3 DailyRate 1470 non-null int64 4 Department 1470 non-null object 5 DistanceFromHome 1470 non-null int64 6 Education 1470 non-null int64 7 EducationField 1470 non-null object 8 EmployeeCount 1470 non-null int64 9 EmployeeNumber 1470 non-null int64 10 EnvironmentSatisfaction 1470 non-null int64 11 Gender 1470 non-null object 12 HourlyRate 1470 non-null int64 13 JobInvolvement 1470 non-null int64 14 JobLevel 1470 non-null int64 15 JobRole 1470 non-null object 16 JobSatisfaction 1470 non-null int64 17 MaritalStatus 1470 non-null object 18 MonthlyIncome 1470 non-null int64 19 MonthlyRate 1470 non-null int64 20 NumCompaniesWorked 1470 non-null int64 21 Over18 1470 non-null object 22 OverTime 1470 non-null object 23 PercentSalaryHike 1470 non-null int64 24 PerformanceRating 1470 non-null int64 25 RelationshipSatisfaction 1470 non-null int64 26 StandardHours 1470 non-null int64 27 StockOptionLevel 1470 non-null int64 28 TotalWorkingYears 1470 non-null int64 29 TrainingTimesLastYear 1470 non-null int64 30 WorkLifeBalance 1470 non-null int64 31 YearsAtCompany 1470 non-null int64 32 YearsInCurrentRole 1470 non-null int64 33 YearsSinceLastPromotion 1470 non-null int64 34 YearsWithCurrManager 1470 non-null int64 dtypes: int64(26), object(9) memory usage: 402.1+ KB
In [5]:
df.describe()
Out[5]:
| Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | ... | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 |
| mean | 36.923810 | 802.485714 | 9.192517 | 2.912925 | 1.0 | 1024.865306 | 2.721769 | 65.891156 | 2.729932 | 2.063946 | ... | 2.712245 | 80.0 | 0.793878 | 11.279592 | 2.799320 | 2.761224 | 7.008163 | 4.229252 | 2.187755 | 4.123129 |
| std | 9.135373 | 403.509100 | 8.106864 | 1.024165 | 0.0 | 602.024335 | 1.093082 | 20.329428 | 0.711561 | 1.106940 | ... | 1.081209 | 0.0 | 0.852077 | 7.780782 | 1.289271 | 0.706476 | 6.126525 | 3.623137 | 3.222430 | 3.568136 |
| min | 18.000000 | 102.000000 | 1.000000 | 1.000000 | 1.0 | 1.000000 | 1.000000 | 30.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 80.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 30.000000 | 465.000000 | 2.000000 | 2.000000 | 1.0 | 491.250000 | 2.000000 | 48.000000 | 2.000000 | 1.000000 | ... | 2.000000 | 80.0 | 0.000000 | 6.000000 | 2.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 |
| 50% | 36.000000 | 802.000000 | 7.000000 | 3.000000 | 1.0 | 1020.500000 | 3.000000 | 66.000000 | 3.000000 | 2.000000 | ... | 3.000000 | 80.0 | 1.000000 | 10.000000 | 3.000000 | 3.000000 | 5.000000 | 3.000000 | 1.000000 | 3.000000 |
| 75% | 43.000000 | 1157.000000 | 14.000000 | 4.000000 | 1.0 | 1555.750000 | 4.000000 | 83.750000 | 3.000000 | 3.000000 | ... | 4.000000 | 80.0 | 1.000000 | 15.000000 | 3.000000 | 3.000000 | 9.000000 | 7.000000 | 3.000000 | 7.000000 |
| max | 60.000000 | 1499.000000 | 29.000000 | 5.000000 | 1.0 | 2068.000000 | 4.000000 | 100.000000 | 4.000000 | 5.000000 | ... | 4.000000 | 80.0 | 3.000000 | 40.000000 | 6.000000 | 4.000000 | 40.000000 | 18.000000 | 15.000000 | 17.000000 |
8 rows × 26 columns
In [6]:
df.columns
print(df.columns)
Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
'YearsWithCurrManager'],
dtype='object')
employee attrition value
In [7]:
df['Attrition'].value_counts().plot(kind='bar', backend='plotly', title='Attrition')
employee attrition %
In [8]:
attrition_counts = df["Attrition"].value_counts()
yes_percent = attrition_counts["Yes"] / (
attrition_counts["Yes"] + attrition_counts["No"]
)
no_percent = attrition_counts["No"] / (attrition_counts["Yes"] + attrition_counts["No"])
fig = go.Figure(
data=[
go.Pie(
labels=["Yes", "No"],
values=[yes_percent, no_percent],
),
]
)
fig.show()
gender analysis
In [9]:
# average age of genders in the dataset
average_age_by_gender = df.groupby('Gender')['Age'].mean()
print(average_age_by_gender)
Gender Female 37.329932 Male 36.653061 Name: Age, dtype: float64
age distribution
In [10]:
from plotly.subplots import make_subplots
# Create subplots: 2 rows, 2 columns
fig = make_subplots(rows=2, cols=2, subplot_titles=("Female Employees", "Male Employees", "Overall Employees"))
# Female employees
female_df = df[df['Gender'] == 'Female']
female_hist = px.histogram(female_df, x='Age')
female_hist.update_traces(showlegend=True)
mean_age_female = female_df['Age'].mean()
# Male employees
male_df = df[df['Gender'] == 'Male']
male_hist = px.histogram(male_df, x='Age')
male_hist.update_traces(showlegend=True)
mean_age_male = male_df['Age'].mean()
# Overall employees
overall_hist = px.histogram(df, x='Age')
overall_hist.update_traces(showlegend=True)
mean_age_overall = df['Age'].mean()
# Add traces to subplots
for trace in female_hist.data:
fig.add_trace(trace, row=1, col=1)
fig.update_xaxes(title_text="Age", row=1, col=1)
fig.update_yaxes(title_text="Count", row=1, col=1)
fig.add_vline(x=mean_age_female, line_width=2, line_dash="dash", line_color="red", annotation_text="Mean Age", annotation_position="top right", row=1, col=1)
for trace in male_hist.data:
fig.add_trace(trace, row=1, col=2)
fig.update_xaxes(title_text="Age", row=1, col=2)
fig.update_yaxes(title_text="Count", row=1, col=2)
fig.add_vline(x=mean_age_male, line_width=2, line_dash="dash", line_color="red", annotation_text="Mean Age", annotation_position="top right", row=1, col=2)
for trace in overall_hist.data:
fig.add_trace(trace, row=2, col=1)
fig.update_xaxes(title_text="Age", row=2, col=1)
fig.update_yaxes(title_text="Count", row=2, col=1)
fig.add_vline(x=mean_age_overall, line_width=2, line_dash="dash", line_color="red", annotation_text="Mean Age", annotation_position="top right", row=2, col=1)
# Update layout
fig.update_layout(title_text="Age Distribution of Employees", height=800)
fig.show()
distribution of job satisfaction
In [11]:
df.groupby(['Gender', 'JobSatisfaction'])['JobSatisfaction'].count()
Out[11]:
Gender JobSatisfaction
Female 1 119
2 118
3 181
4 170
Male 1 170
2 162
3 261
4 289
Name: JobSatisfaction, dtype: int64
In [12]:
# Group by Gender and JobSatisfaction and count the occurrences
grouped_df = (
df.groupby(["Gender", "JobSatisfaction"])["JobSatisfaction"]
.count()
.reset_index(name="Count")
)
# Create a bar plot
fig = px.bar(
grouped_df,
x="JobSatisfaction",
y="Count",
color="Gender",
barmode="group",
title="Job Satisfaction by Gender",
)
fig.show()
Monthly Income by Gender
In [13]:
fig = px.strip(
df,
x='Gender',
y='MonthlyIncome',
title='Disparity in Monthly Income',
hover_data=['MonthlyIncome', 'JobSatisfaction'],
color='Gender'
)
fig.show()
Average Income and Presence by Department
In [14]:
# Calculate the mean monthly income by gender
mean_income = df.groupby('Gender')['MonthlyIncome'].mean().reset_index()
# Create a bar plot
fig = px.bar(mean_income, x='Gender', y='MonthlyIncome', title='Average Monthly Income by Gender', color='Gender')
fig.show()
In [15]:
grouped_df = df.groupby(['Gender', 'Department'])['Department'].count().reset_index(name='Count')
# Create a bar plot
fig = px.bar(
grouped_df,
x="Department",
y="Count",
color="Gender",
barmode="group",
title="Number of Employees by Department",
)
fig.show()